DPI = 100  # matplotlib figure resolution (dots per inch)
RATE = 22050  # audio sample rate in Hz — presumably the Tacotron2/WaveNet rate; confirm against training config
def dict_eq(d, key, value=None):
    """Test whether *d* contains *key*, optionally with a specific value.

    Returns False when the key is absent. With value=None, returns the
    truthiness of d[key]; otherwise returns whether d[key] == value.
    """
    if key not in d:  # membership test directly on the dict; no .keys() needed
        return False
    if value is None:
        return bool(d[key])
    return d[key] == value
def configure_time_axis(ax, num, duration):
    """Relabel the x axis of *ax* in seconds.

    num      : number of x-axis units (frames or samples) spanning the plot.
    duration : total duration in seconds the axis represents.
    """
    def _seconds(value, pos):
        # Map an axis coordinate in [0, num) to its time in seconds.
        return '%.2f' % (value / num * duration)

    ax.xaxis.set_major_formatter(plt.FuncFormatter(_seconds))
    ax.set_xlabel('Time (seconds)')
def configure_freq_axis(ax, num_bins):
    """Relabel the y axis of *ax* in Hz, assuming mel-spaced frequency bins.

    Bins are assumed to be spaced uniformly on the mel scale between
    fmin and fmax; each tick value is converted back to Hz for display.
    """
    fmin, fmax = 125, 7600  # from Tacotron2 hparams.py

    def _mel_to_hz(mel):
        return 700 * (np.exp(mel / 1127) - 1)

    def _hz_to_mel(f):
        return 1127 * np.log(1 + f / 700)

    lower_mel = _hz_to_mel(fmin)
    upper_mel = _hz_to_mel(fmax)

    def _hz_label(value, pos):
        # Bin index -> mel value -> Hz, formatted to two decimals.
        step = (upper_mel - lower_mel) / num_bins
        return '%.2f' % _mel_to_hz(lower_mel + value * step)

    ax.yaxis.set_major_formatter(plt.FuncFormatter(_hz_label))
    ax.set_ylabel('Frequency (Hz)')
def plot_mel(fig, ax, title, mel):
    """Draw the mel-spectrogram *mel* (shape: frames x bins) on *ax*.

    Axes are relabelled in seconds / Hz and a colorbar is attached to *fig*.
    """
    num_frames = len(mel)
    total_seconds = num_frames * audio.get_hop_size() / RATE
    configure_time_axis(ax, num_frames, total_seconds)
    configure_freq_axis(ax, mel.shape[1])
    ax.set_title(title)
    # imshow wants (bins, frames), so flip the first two axes.
    image = ax.imshow(np.swapaxes(mel, 0, 1), origin='lower',
                      cmap='coolwarm', aspect='auto')
    fig.colorbar(image, ax=ax)
def display_wave(title, wave):
    """Plot *wave* as a time-domain waveform and render an inline audio player.

    wave is assumed to be a 1-D float array in [-1, 1] sampled at RATE Hz
    (the y limits clip anything outside that range visually).
    """
    # Use the module-level DPI constant instead of a hard-coded 100,
    # for consistency with display_learned_mel (same value, same output).
    fig, ax = plt.subplots(figsize=(8, 2), dpi=DPI)
    ax.set_ylim(-1, 1)
    ax.plot(wave)
    configure_time_axis(ax, len(wave), len(wave) / RATE)
    fig.suptitle(title, fontsize=14, y=1.1)
    plt.tight_layout()
    plt.show()
    display(Audio(wave, rate=RATE))
def display_learned_mel(props, mel, wave=None):
    """Show a learned mel-spectrogram side by side with its starting point.

    props : dict of experiment properties; reads 'conv_id', 'mode',
        'channel_ids', and the optional flags 'mel_zeros' / 'wave_zeros'.
    mel   : (frames, bins) array — the spectrogram produced by maximizing
        a WaveNet activation objective.
    wave  : optional audio generated by WaveNet from *mel*; when given it
        is plotted and played via display_wave.

    Removed: commented-out setup lines and a dead triple-quoted block that
    duplicated display_wave (it was a no-op string expression at runtime).
    """
    mel_zeros = dict_eq(props, 'mel_zeros')
    wave_zeros = dict_eq(props, 'wave_zeros')
    # Reference spectrogram generated by Tacotron2, rescaled from [0, 4] to [0, 1].
    mel0 = np.load('../Tacotron-2/tacotron_output/eval/speech-mel-00001.npy')
    mel0 = np.interp(mel0, (0, 4), (0, 1))
    if mel_zeros:
        mel0 = np.zeros(mel0.shape)
    conv_id = props['conv_id']
    mode = 'skip' if props['mode'] == 'skip' else 'residual'
    channel_ids = str(props['channel_ids'])[1:-1]  # drop the surrounding list brackets
    s = 's' if len(props['channel_ids']) > 1 else ''
    obj = 'layer %d, %s output, channel%s\n%s' % (conv_id, mode, s, channel_ids)
    fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=DPI)
    if mel_zeros:
        title0 = 'Initial mel-spectrogram (zeros)'
    else:
        title0 = 'Initial mel-spectrogram\n(generated by Tacotron2)'
    title1 = 'Mel-spectrogram trained to maximize\n%s' % obj
    plot_mel(fig, axes[0], title0, mel0)
    plot_mel(fig, axes[1], title1, mel)
    wave_mode = 'zeros' if wave_zeros else 'speech'
    title = 'Maximizing activation objective with respect to\n'
    title += 'local conditioning features (wave input is %s)' % wave_mode
    fig.suptitle(title, fontsize=18, y=1.15)
    plt.subplots_adjust(wspace=0.5)
    plt.tight_layout()
    plt.show()
    if wave is not None:
        display_wave('Audio generated by WaveNet', wave)
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + e^-x) via numpy."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)
# Driver: load every pickled (props, mel) result in the working directory,
# newest first (filenames are assumed to be '<int>.pickle'), and display
# each one together with its generated audio when a matching .wav exists.
paths = [p for p in os.listdir() if p.endswith('.pickle')]
paths.sort(key=lambda p: int(p.split('.')[0]), reverse=True)
for path in paths:
    # Use a context manager so the file handle is closed deterministically
    # (the original pickle.load(open(...)) leaked the handle).
    with open(path, 'rb') as f:
        props, mel = pickle.load(f)
    wave_path = path + '.wav'
    wave = audio.load_wav(wave_path) if os.path.exists(wave_path) else None
    if dict_eq(props, 'sigmoid'):
        mel = sigmoid(mel)
    display_learned_mel(props, mel, wave)